packages <- c("tidyverse", "kableExtra", "broom", "broomExtra", "gganimate")
xfun::pkg_attach(packages, message = F)
nice_tables <- function(table) {
table %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width = "100%", height = "200px")
}
Let’s first look at the regular season dataset.
df_regular_season <- read_csv("data/RegularSeasonDetailedResults.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## WLoc = col_character()
## )
## See spec(...) for full column specifications.
glimpse(df_regular_season)
## Observations: 82,041
## Variables: 34
## $ Season <dbl> 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 20…
## $ DayNum <dbl> 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, …
## $ WTeamID <dbl> 1104, 1272, 1266, 1296, 1400, 1458, 1161, 1186, 1194, 14…
## $ WScore <dbl> 68, 70, 73, 56, 77, 81, 80, 75, 71, 84, 106, 74, 66, 76,…
## $ LTeamID <dbl> 1328, 1393, 1437, 1457, 1208, 1186, 1236, 1457, 1156, 12…
## $ LScore <dbl> 62, 63, 61, 50, 71, 55, 62, 61, 66, 56, 50, 73, 65, 48, …
## $ WLoc <chr> "N", "N", "N", "N", "N", "H", "H", "N", "N", "H", "H", "…
## $ NumOT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ WFGM <dbl> 27, 26, 24, 18, 30, 26, 23, 28, 28, 32, 41, 29, 26, 25, …
## $ WFGA <dbl> 58, 62, 58, 38, 61, 57, 55, 62, 58, 67, 69, 51, 66, 56, …
## $ WFGM3 <dbl> 3, 8, 8, 3, 6, 6, 2, 4, 5, 5, 15, 7, 5, 10, 11, 10, 5, 8…
## $ WFGA3 <dbl> 14, 20, 18, 9, 14, 12, 8, 14, 11, 17, 25, 13, 19, 23, 31…
## $ WFTM <dbl> 11, 10, 17, 17, 11, 23, 32, 15, 10, 15, 9, 9, 9, 16, 12,…
## $ WFTA <dbl> 18, 19, 29, 31, 13, 27, 39, 21, 18, 19, 13, 11, 13, 23, …
## $ WOR <dbl> 14, 15, 17, 6, 17, 12, 13, 13, 9, 14, 15, 6, 21, 8, 15, …
## $ WDR <dbl> 24, 28, 26, 19, 22, 24, 18, 35, 22, 22, 29, 21, 23, 35, …
## $ WAst <dbl> 13, 16, 15, 11, 12, 12, 14, 19, 9, 11, 21, 18, 15, 18, 2…
## $ WTO <dbl> 23, 13, 10, 12, 14, 9, 17, 19, 17, 6, 11, 15, 17, 13, 16…
## $ WStl <dbl> 7, 4, 5, 14, 4, 9, 11, 7, 9, 12, 10, 7, 12, 14, 18, 2, 5…
## $ WBlk <dbl> 1, 4, 2, 2, 4, 3, 1, 2, 2, 0, 6, 1, 3, 19, 5, 6, 3, 5, 1…
## $ WPF <dbl> 22, 18, 25, 18, 20, 18, 25, 21, 23, 13, 16, 5, 17, 13, 2…
## $ LFGM <dbl> 22, 24, 22, 18, 24, 20, 19, 20, 24, 23, 17, 29, 24, 18, …
## $ LFGA <dbl> 53, 67, 73, 49, 62, 46, 41, 59, 52, 52, 52, 63, 56, 64, …
## $ LFGM3 <dbl> 2, 6, 3, 6, 6, 3, 4, 4, 6, 3, 4, 10, 6, 8, 4, 7, 2, 2, 3…
## $ LFGA3 <dbl> 10, 24, 26, 22, 16, 11, 15, 17, 18, 14, 11, 22, 19, 24, …
## $ LFTM <dbl> 16, 9, 14, 8, 17, 12, 20, 17, 12, 7, 12, 5, 11, 4, 17, 1…
## $ LFTA <dbl> 22, 20, 23, 15, 27, 17, 28, 23, 27, 12, 17, 5, 17, 8, 27…
## $ LOR <dbl> 10, 20, 31, 17, 21, 6, 9, 8, 13, 9, 8, 13, 14, 14, 20, 6…
## $ LDR <dbl> 22, 25, 22, 20, 15, 22, 21, 25, 26, 23, 15, 16, 21, 26, …
## $ LAst <dbl> 8, 7, 9, 9, 12, 8, 11, 10, 13, 10, 8, 15, 17, 12, 17, 21…
## $ LTO <dbl> 18, 12, 12, 19, 10, 19, 30, 15, 25, 18, 17, 12, 18, 17, …
## $ LStl <dbl> 9, 8, 2, 4, 7, 4, 10, 14, 8, 1, 7, 6, 8, 10, 7, 5, 15, 8…
## $ LBlk <dbl> 2, 6, 5, 3, 1, 3, 4, 8, 2, 3, 3, 2, 4, 0, 7, 1, 3, 4, 5,…
## $ LPF <dbl> 20, 16, 23, 23, 14, 25, 28, 18, 18, 18, 15, 12, 13, 17, …
We will also need to load the Teams.csv dataset to append actual team names rather than IDs.
df_teams <- read_csv("data/Teams.csv")
## Parsed with column specification:
## cols(
## TeamID = col_double(),
## TeamName = col_character(),
## FirstD1Season = col_double(),
## LastD1Season = col_double()
## )
glimpse(df_teams)
## Observations: 366
## Variables: 4
## $ TeamID <dbl> 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 11…
## $ TeamName <chr> "Abilene Chr", "Air Force", "Akron", "Alabama", "A…
## $ FirstD1Season <dbl> 2014, 1985, 1985, 1985, 2000, 1985, 2000, 1985, 19…
## $ LastD1Season <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 19…
# select only the TeamID and TeamName vars
df_teams <- df_teams %>% select(TeamID, TeamName)
# create df for win and loss TeamID and TeamNames to append to regular season df
df_wteams <- df_teams
colnames(df_wteams) <- c("WTeamID", "WTeamName")
df_lteams <- df_teams
colnames(df_lteams) <- c("LTeamID", "LTeamName")
# add actual names using inner_join
# remove TeamID
df_regular_season <- df_regular_season %>%
inner_join(., df_wteams, by = "WTeamID") %>%
inner_join(., df_lteams, by = "LTeamID") %>%
select(-contains("TeamID"))
glimpse(df_regular_season)
## Observations: 82,041
## Variables: 34
## $ Season <dbl> 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, …
## $ DayNum <dbl> 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13…
## $ WScore <dbl> 68, 70, 73, 56, 77, 81, 80, 75, 71, 84, 106, 74, 66, 7…
## $ LScore <dbl> 62, 63, 61, 50, 71, 55, 62, 61, 66, 56, 50, 73, 65, 48…
## $ WLoc <chr> "N", "N", "N", "N", "N", "H", "H", "N", "N", "H", "H",…
## $ NumOT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ WFGM <dbl> 27, 26, 24, 18, 30, 26, 23, 28, 28, 32, 41, 29, 26, 25…
## $ WFGA <dbl> 58, 62, 58, 38, 61, 57, 55, 62, 58, 67, 69, 51, 66, 56…
## $ WFGM3 <dbl> 3, 8, 8, 3, 6, 6, 2, 4, 5, 5, 15, 7, 5, 10, 11, 10, 5,…
## $ WFGA3 <dbl> 14, 20, 18, 9, 14, 12, 8, 14, 11, 17, 25, 13, 19, 23, …
## $ WFTM <dbl> 11, 10, 17, 17, 11, 23, 32, 15, 10, 15, 9, 9, 9, 16, 1…
## $ WFTA <dbl> 18, 19, 29, 31, 13, 27, 39, 21, 18, 19, 13, 11, 13, 23…
## $ WOR <dbl> 14, 15, 17, 6, 17, 12, 13, 13, 9, 14, 15, 6, 21, 8, 15…
## $ WDR <dbl> 24, 28, 26, 19, 22, 24, 18, 35, 22, 22, 29, 21, 23, 35…
## $ WAst <dbl> 13, 16, 15, 11, 12, 12, 14, 19, 9, 11, 21, 18, 15, 18,…
## $ WTO <dbl> 23, 13, 10, 12, 14, 9, 17, 19, 17, 6, 11, 15, 17, 13, …
## $ WStl <dbl> 7, 4, 5, 14, 4, 9, 11, 7, 9, 12, 10, 7, 12, 14, 18, 2,…
## $ WBlk <dbl> 1, 4, 2, 2, 4, 3, 1, 2, 2, 0, 6, 1, 3, 19, 5, 6, 3, 5,…
## $ WPF <dbl> 22, 18, 25, 18, 20, 18, 25, 21, 23, 13, 16, 5, 17, 13,…
## $ LFGM <dbl> 22, 24, 22, 18, 24, 20, 19, 20, 24, 23, 17, 29, 24, 18…
## $ LFGA <dbl> 53, 67, 73, 49, 62, 46, 41, 59, 52, 52, 52, 63, 56, 64…
## $ LFGM3 <dbl> 2, 6, 3, 6, 6, 3, 4, 4, 6, 3, 4, 10, 6, 8, 4, 7, 2, 2,…
## $ LFGA3 <dbl> 10, 24, 26, 22, 16, 11, 15, 17, 18, 14, 11, 22, 19, 24…
## $ LFTM <dbl> 16, 9, 14, 8, 17, 12, 20, 17, 12, 7, 12, 5, 11, 4, 17,…
## $ LFTA <dbl> 22, 20, 23, 15, 27, 17, 28, 23, 27, 12, 17, 5, 17, 8, …
## $ LOR <dbl> 10, 20, 31, 17, 21, 6, 9, 8, 13, 9, 8, 13, 14, 14, 20,…
## $ LDR <dbl> 22, 25, 22, 20, 15, 22, 21, 25, 26, 23, 15, 16, 21, 26…
## $ LAst <dbl> 8, 7, 9, 9, 12, 8, 11, 10, 13, 10, 8, 15, 17, 12, 17, …
## $ LTO <dbl> 18, 12, 12, 19, 10, 19, 30, 15, 25, 18, 17, 12, 18, 17…
## $ LStl <dbl> 9, 8, 2, 4, 7, 4, 10, 14, 8, 1, 7, 6, 8, 10, 7, 5, 15,…
## $ LBlk <dbl> 2, 6, 5, 3, 1, 3, 4, 8, 2, 3, 3, 2, 4, 0, 7, 1, 3, 4, …
## $ LPF <dbl> 20, 16, 23, 23, 14, 25, 28, 18, 18, 18, 15, 12, 13, 17…
## $ WTeamName <chr> "Alabama", "Memphis", "Marquette", "N Illinois", "Texa…
## $ LTeamName <chr> "Oklahoma", "Syracuse", "Villanova", "Winthrop", "Geor…
Let’s break apart the tables into wins and losses so that we can manipulate the same variables at the same time.
df_regular_season_win <- df_regular_season %>%
select(Season, DayNum, contains("W"))
colnames(df_regular_season_win) <- gsub("W", "", colnames(df_regular_season_win))
df_regular_season_loss <- df_regular_season %>%
select(Season, DayNum, contains("L"))
colnames(df_regular_season_loss) <- gsub("L", "", colnames(df_regular_season_loss))
# data manipulation ----
# create new variables that are also easy to read
data_manipulation_reg_season <- function(data) {
data %>%
select(season = Season,
n_day = DayNum,
team_name = TeamName,
score = Score,
points_made = FGM,
points_attempted = FGA,
points_3_made = FGM3,
points_3_attempted = FGA3,
points_free_throws_made = FTM,
points_free_throws_attempted = FTA,
rebound_offensive = OR,
rebound_defensive = DR,
assist = Ast,
turnovers = TO,
steals = Stl,
blocks = Blk,
personal_fouls = PF) %>%
mutate(index = 1:nrow(data),
points_2_made = points_made - points_3_made,
points_2_attempted = points_attempted - points_3_attempted,
points_missed = points_attempted - points_made,
points_2_missed = points_2_attempted - points_2_made,
points_3_missed = points_3_attempted - points_3_made,
points_free_throws_missed = points_free_throws_attempted - points_free_throws_made,
points_made_percentage = points_made / points_attempted * 100,
points_2_made_percentage = points_2_made / points_2_attempted * 100,
points_3_made_percentage = points_3_made / points_3_attempted * 100,
points_free_throws_made_percentage = points_free_throws_made / points_free_throws_attempted * 100)
}
df_regular_season_win <- data_manipulation_reg_season(df_regular_season_win) %>% mutate(outcome = "win")
glimpse(df_regular_season_win)
## Observations: 82,041
## Variables: 29
## $ season <dbl> 2003, 2003, 2003, 2003, 2003,…
## $ n_day <dbl> 10, 10, 11, 11, 11, 11, 12, 1…
## $ team_name <chr> "Alabama", "Memphis", "Marque…
## $ score <dbl> 68, 70, 73, 56, 77, 81, 80, 7…
## $ points_made <dbl> 27, 26, 24, 18, 30, 26, 23, 2…
## $ points_attempted <dbl> 58, 62, 58, 38, 61, 57, 55, 6…
## $ points_3_made <dbl> 3, 8, 8, 3, 6, 6, 2, 4, 5, 5,…
## $ points_3_attempted <dbl> 14, 20, 18, 9, 14, 12, 8, 14,…
## $ points_free_throws_made <dbl> 11, 10, 17, 17, 11, 23, 32, 1…
## $ points_free_throws_attempted <dbl> 18, 19, 29, 31, 13, 27, 39, 2…
## $ rebound_offensive <dbl> 14, 15, 17, 6, 17, 12, 13, 13…
## $ rebound_defensive <dbl> 24, 28, 26, 19, 22, 24, 18, 3…
## $ assist <dbl> 13, 16, 15, 11, 12, 12, 14, 1…
## $ turnovers <dbl> 23, 13, 10, 12, 14, 9, 17, 19…
## $ steals <dbl> 7, 4, 5, 14, 4, 9, 11, 7, 9, …
## $ blocks <dbl> 1, 4, 2, 2, 4, 3, 1, 2, 2, 0,…
## $ personal_fouls <dbl> 22, 18, 25, 18, 20, 18, 25, 2…
## $ index <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10…
## $ points_2_made <dbl> 24, 18, 16, 15, 24, 20, 21, 2…
## $ points_2_attempted <dbl> 44, 42, 40, 29, 47, 45, 47, 4…
## $ points_missed <dbl> 31, 36, 34, 20, 31, 31, 32, 3…
## $ points_2_missed <dbl> 20, 24, 24, 14, 23, 25, 26, 2…
## $ points_3_missed <dbl> 11, 12, 10, 6, 8, 6, 6, 10, 6…
## $ points_free_throws_missed <dbl> 7, 9, 12, 14, 2, 4, 7, 6, 8, …
## $ points_made_percentage <dbl> 46.55172, 41.93548, 41.37931,…
## $ points_2_made_percentage <dbl> 54.54545, 42.85714, 40.00000,…
## $ points_3_made_percentage <dbl> 21.42857, 40.00000, 44.44444,…
## $ points_free_throws_made_percentage <dbl> 61.11111, 52.63158, 58.62069,…
## $ outcome <chr> "win", "win", "win", "win", "…
df_regular_season_loss <- data_manipulation_reg_season(df_regular_season_loss) %>% mutate(outcome = "loss")
glimpse(df_regular_season_loss)
## Observations: 82,041
## Variables: 29
## $ season <dbl> 2003, 2003, 2003, 2003, 2003,…
## $ n_day <dbl> 10, 10, 11, 11, 11, 11, 12, 1…
## $ team_name <chr> "Oklahoma", "Syracuse", "Vill…
## $ score <dbl> 62, 63, 61, 50, 71, 55, 62, 6…
## $ points_made <dbl> 22, 24, 22, 18, 24, 20, 19, 2…
## $ points_attempted <dbl> 53, 67, 73, 49, 62, 46, 41, 5…
## $ points_3_made <dbl> 2, 6, 3, 6, 6, 3, 4, 4, 6, 3,…
## $ points_3_attempted <dbl> 10, 24, 26, 22, 16, 11, 15, 1…
## $ points_free_throws_made <dbl> 16, 9, 14, 8, 17, 12, 20, 17,…
## $ points_free_throws_attempted <dbl> 22, 20, 23, 15, 27, 17, 28, 2…
## $ rebound_offensive <dbl> 10, 20, 31, 17, 21, 6, 9, 8, …
## $ rebound_defensive <dbl> 22, 25, 22, 20, 15, 22, 21, 2…
## $ assist <dbl> 8, 7, 9, 9, 12, 8, 11, 10, 13…
## $ turnovers <dbl> 18, 12, 12, 19, 10, 19, 30, 1…
## $ steals <dbl> 9, 8, 2, 4, 7, 4, 10, 14, 8, …
## $ blocks <dbl> 2, 6, 5, 3, 1, 3, 4, 8, 2, 3,…
## $ personal_fouls <dbl> 20, 16, 23, 23, 14, 25, 28, 1…
## $ index <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10…
## $ points_2_made <dbl> 20, 18, 19, 12, 18, 17, 15, 1…
## $ points_2_attempted <dbl> 43, 43, 47, 27, 46, 35, 26, 4…
## $ points_missed <dbl> 31, 43, 51, 31, 38, 26, 22, 3…
## $ points_2_missed <dbl> 23, 25, 28, 15, 28, 18, 11, 2…
## $ points_3_missed <dbl> 8, 18, 23, 16, 10, 8, 11, 13,…
## $ points_free_throws_missed <dbl> 6, 11, 9, 7, 10, 5, 8, 6, 15,…
## $ points_made_percentage <dbl> 41.50943, 35.82090, 30.13699,…
## $ points_2_made_percentage <dbl> 46.51163, 41.86047, 40.42553,…
## $ points_3_made_percentage <dbl> 20.00000, 25.00000, 11.53846,…
## $ points_free_throws_made_percentage <dbl> 72.72727, 45.00000, 60.86957,…
## $ outcome <chr> "loss", "loss", "loss", "loss…
df_regular_season_all <- bind_rows(df_regular_season_win, df_regular_season_loss) %>% mutate(outcome = as_factor(outcome))
glimpse(df_regular_season_all)
## Observations: 164,082
## Variables: 29
## $ season <dbl> 2003, 2003, 2003, 2003, 2003,…
## $ n_day <dbl> 10, 10, 11, 11, 11, 11, 12, 1…
## $ team_name <chr> "Alabama", "Memphis", "Marque…
## $ score <dbl> 68, 70, 73, 56, 77, 81, 80, 7…
## $ points_made <dbl> 27, 26, 24, 18, 30, 26, 23, 2…
## $ points_attempted <dbl> 58, 62, 58, 38, 61, 57, 55, 6…
## $ points_3_made <dbl> 3, 8, 8, 3, 6, 6, 2, 4, 5, 5,…
## $ points_3_attempted <dbl> 14, 20, 18, 9, 14, 12, 8, 14,…
## $ points_free_throws_made <dbl> 11, 10, 17, 17, 11, 23, 32, 1…
## $ points_free_throws_attempted <dbl> 18, 19, 29, 31, 13, 27, 39, 2…
## $ rebound_offensive <dbl> 14, 15, 17, 6, 17, 12, 13, 13…
## $ rebound_defensive <dbl> 24, 28, 26, 19, 22, 24, 18, 3…
## $ assist <dbl> 13, 16, 15, 11, 12, 12, 14, 1…
## $ turnovers <dbl> 23, 13, 10, 12, 14, 9, 17, 19…
## $ steals <dbl> 7, 4, 5, 14, 4, 9, 11, 7, 9, …
## $ blocks <dbl> 1, 4, 2, 2, 4, 3, 1, 2, 2, 0,…
## $ personal_fouls <dbl> 22, 18, 25, 18, 20, 18, 25, 2…
## $ index <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10…
## $ points_2_made <dbl> 24, 18, 16, 15, 24, 20, 21, 2…
## $ points_2_attempted <dbl> 44, 42, 40, 29, 47, 45, 47, 4…
## $ points_missed <dbl> 31, 36, 34, 20, 31, 31, 32, 3…
## $ points_2_missed <dbl> 20, 24, 24, 14, 23, 25, 26, 2…
## $ points_3_missed <dbl> 11, 12, 10, 6, 8, 6, 6, 10, 6…
## $ points_free_throws_missed <dbl> 7, 9, 12, 14, 2, 4, 7, 6, 8, …
## $ points_made_percentage <dbl> 46.55172, 41.93548, 41.37931,…
## $ points_2_made_percentage <dbl> 54.54545, 42.85714, 40.00000,…
## $ points_3_made_percentage <dbl> 21.42857, 40.00000, 44.44444,…
## $ points_free_throws_made_percentage <dbl> 61.11111, 52.63158, 58.62069,…
## $ outcome <fct> win, win, win, win, win, win,…
Now let’s compare some winning and losing stats.
df_regular_season_all %>%
group_by(team_name, outcome) %>%
count() %>%
ungroup() %>%
filter(outcome == "win") %>%
top_n(10) %>%
arrange(desc(n)) %>%
mutate(team_name = factor(team_name, levels = team_name)) %>%
ggplot(., aes(x = team_name, y = n, fill = n)) +
geom_col() +
scale_fill_distiller(palette = "Blues", direction = -1) +
theme_minimal() +
theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
labs(title = "Most Wins During Regular Season",
x = "\nTeam",
y = "Frequency\n",
fill = "Frequency\n")
## Selecting by n
reg_season_metrics <- df_regular_season_all %>%
select(-c(season, n_day, team_name, index, outcome)) %>%
colnames()
for (metric in reg_season_metrics) {
figure <- df_regular_season_all %>%
ggplot(., aes(x = eval(as.name((metric))), fill = outcome)) +
geom_density(alpha = 0.5, color = 0) +
theme_minimal() +
labs(x = paste0("\n",metric),
y = "Density\n")
print(figure)
}
## Warning: Removed 36 rows containing non-finite values (stat_density).
It looks like the losing team shoots slightly more, but misses even more.
It looks like the winning team takes less shots, misses less, has more offensive rebounds, more assists, less turnovers, more steals, more blocks, and less personal fouls.
df_most_wins <- df_regular_season_all %>%
group_by(season, team_name, outcome) %>%
count() %>%
ungroup() %>%
filter(outcome == "win") %>%
group_by(season) %>%
top_n(10) %>%
ungroup() %>%
arrange(season, desc(n)) %>%
mutate(order = row_number())
## Selecting by n
ggplot(df_most_wins, aes(x = order, y = n, fill = n)) +
geom_col() +
scale_fill_distiller(palette = "Blues", direction = -1) +
theme_minimal() +
theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
labs(title = "Most Wins During Regular Season",
x = "\nTeam",
y = "Frequency\n",
fill = "Frequency\n") +
facet_wrap(~ season, scales = "free") +
scale_x_continuous(breaks = df_most_wins$order,
labels = df_most_wins$team_name,
expand = c(0,0))
for (metric in reg_season_metrics) {
figure <- df_regular_season_all %>%
ggplot(., aes(x = eval(as.name((metric))), fill = outcome)) +
geom_density(alpha = 0.5, color = 0) +
theme_minimal() +
labs(x = paste0("\n",metric),
y = "Density\n") +
facet_wrap(~ season)
print(figure)
}
## Warning: Removed 36 rows containing non-finite values (stat_density).